import torch
from PIL import Image
import gradio as gr
import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import os
from threading import Thread


# Optional Hugging Face access token read from the environment (None if unset).
HF_TOKEN = os.getenv("HF_TOKEN")

# Hub identifiers of the standard and the 1M-context chat checkpoints.
MODEL_ID = "THUDM/glm-4-9b-chat"
MODEL_ID2 = "THUDM/glm-4-9b-chat-1m"

# Path the model and tokenizer are loaded from, and its last path component.
MODELS = "hf-models/glm-4-9b-chat"
MODEL_NAME = MODELS.rsplit("/", 1)[-1]

# HTML fragments shown at the top of the page.
TITLE = "<h1><center>Gitee AI GLM-4-9B</center></h1>"
DESCRIPTION = ""

# Extra stylesheet handed to gr.Blocks.
CSS = """
.duplicate-button {
  margin: auto !important;
  color: white !important;
  background: black !important;
  border-radius: 100vh !important;
}
"""

# Load the checkpoint weights in bfloat16 from the MODELS path.
model = AutoModelForCausalLM.from_pretrained(
    MODELS,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
)
# Move the weights to device index 0 and switch the module to inference mode.
model = model.to(0)
model.eval()

# The tokenizer is loaded from the same path as the model.
tokenizer = AutoTokenizer.from_pretrained(MODELS, trust_remote_code=True)


def stream_chat(message: str, history: list, temperature: float, max_length: int):
    """Stream the model's reply to ``message`` as a growing string.

    Args:
        message: The new user message.
        history: Previous turns as ``(user_prompt, assistant_answer)`` pairs.
        temperature: Sampling temperature. Values <= 0 select greedy decoding,
            because sampling requires a strictly positive temperature.
        max_length: Forwarded to ``model.generate`` as ``max_length``.

    Yields:
        The reply text accumulated so far, once per chunk emitted by the
        streamer.

    Raises:
        Exception: Whatever ``model.generate`` raised in the worker thread is
            re-raised here once the stream has been drained.
    """
    print(f'message is - {message}')
    conversation = []
    for prompt, answer in history:
        conversation.extend([{"role": "user", "content": prompt}, {"role": "assistant", "content": answer}])
    conversation.append({"role": "user", "content": message})

    print(f"Conversation is -\n{conversation}")

    model_inputs = tokenizer.apply_chat_template(
        conversation,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)
    streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)

    generate_kwargs = dict(
        max_length=max_length,
        streamer=streamer,
        repetition_penalty=1.2,
    )
    if temperature > 0:
        generate_kwargs.update(do_sample=True, top_k=1, temperature=temperature)
    else:
        # The UI slider allows 0, which sampling rejects. With top_k=1 only the
        # top-ranked token can ever be drawn, so greedy decoding is the
        # matching fallback.
        generate_kwargs["do_sample"] = False
    gen_kwargs = {**model_inputs, **generate_kwargs}

    worker_errors = []

    def _generate():
        # Grad mode is thread-local, so it has to be disabled inside the
        # worker rather than around Thread(...) in the calling thread.
        try:
            with torch.no_grad():
                model.generate(**gen_kwargs)
        except Exception as exc:
            # Hand the failure to the consumer and close the stream so it does
            # not sit blocked until the streamer timeout expires.
            worker_errors.append(exc)
            streamer.end()

    thread = Thread(target=_generate)
    thread.start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        yield buffer

    thread.join()
    if worker_errors:
        raise worker_errors[0]
 



# Chat display component, created up front and passed to ChatInterface below.
chatbot = gr.Chatbot(height=450)

with gr.Blocks(css=CSS) as demo:
    # Page header.
    gr.HTML(TITLE)
    gr.HTML(DESCRIPTION)

    # Generation controls. They are created with render=False and handed to
    # ChatInterface, which receives them as the extra arguments of stream_chat
    # (temperature first, then max_length).
    parameters_accordion = gr.Accordion(label="⚙️ Parameters", open=False, render=False)
    temperature_slider = gr.Slider(
        label="Temperature",
        minimum=0,
        maximum=1,
        step=0.1,
        value=0.8,
        render=False,
    )
    max_length_slider = gr.Slider(
        label="Max Length",
        minimum=128,
        maximum=8192,
        step=1,
        value=1024,
        render=False,
    )

    # One single-message example per row.
    example_prompts = [
        [text]
        for text in (
            "鲁迅和周树人什么关系",
            "从前有一头牛，这头牛后面有什么？",
            "以红楼梦的行文风格写一张委婉的请假条。不少于320字。",
            "人生的意义是什么",
            "js 实现python 的 range(10)",
            "我的蓝牙耳机坏了，我该去看牙科还是耳鼻喉科？",
            "鸡和兔子同笼，头共 10，脚共28，鸡兔子各有几只？",
        )
    ]

    gr.ChatInterface(
        fn=stream_chat,
        chatbot=chatbot,
        fill_height=True,
        additional_inputs_accordion=parameters_accordion,
        additional_inputs=[temperature_slider, max_length_slider],
        examples=example_prompts,
        cache_examples=False,
    )


if __name__ == "__main__":
    # Script entry point: configure the request queue, then start the server
    # with debug output enabled.
    demo.queue(status_update_rate="auto")
    demo.launch(debug=True)
